# Chapter 7: Data cleaning

from utils.commonUtil import get_bs4_obj
import re
from collections import OrderedDict

# N-gram tokenization (English text only)

def ngrams(inputStr: str, n: int) -> list:
    """Split English text into overlapping word n-grams.

    Non-ASCII characters are dropped, the remaining text is split on runs of
    whitespace, and every window of ``n`` consecutive words is collected.

    Args:
        inputStr: Raw text to tokenize.
        n: Number of words per n-gram.

    Returns:
        A list of n-grams in order of appearance, each one a list of ``n``
        words. The list is empty when ``n`` is less than 1 or when the text
        contains fewer than ``n`` words.
    """
    if n < 1:
        return []

    # Round-trip through UTF-8 bytes and decode as ASCII with errors ignored,
    # which discards every non-ASCII character.
    ascii_text = inputStr.encode("utf-8").decode("ascii", "ignore")

    # split() with no separator collapses any run of whitespace (including
    # mixed newline/space runs) and never produces empty-string tokens.
    words = ascii_text.split()

    # Slide the window across the whole word list, not just a fraction of it.
    return [words[i:i + n] for i in range(len(words) - n + 1)]

# Disabled usage example: it is wrapped in a bare string literal, so none of
# it executes. If enabled, it would load the Wikipedia article on Python via
# get_bs4_obj, take the text of the element with id "mw-content-text", and
# print the number of 2-grams followed by the 2-grams themselves.
# NOTE(review): get_bs4_obj comes from utils.commonUtil and is not visible
# here; presumably it returns a BeautifulSoup object for the URL - confirm.
'''
url = "http://en.wikipedia.org/wiki/Python_(programming_language)"
bs4 = get_bs4_obj(url)
htmlText = bs4.find("div",{"id":'mw-content-text'}).get_text()
ngrams_rets = ngrams(htmlText,2)
print(len(ngrams_rets),ngrams_rets) 
'''

# Sample ordered mapping: the letters 'a'..'d' paired with 1..4, inserted in
# that order.
od = OrderedDict(zip("abcd", range(1, 5)))

